# Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
# Load the dataset from the CSV file next to this script.
df = pd.read_csv('Life Expectancy Data.csv')

# Heatmap of the null mask, then a count of rows per Status value.
sns.heatmap(df.isnull())
sns.countplot(data=df, x='Status')

# Replace missing values in these two columns with the column mean.
df['Life expectancy '] = df['Life expectancy '].fillna(
    df['Life expectancy '].mean()
)
df['Adult Mortality'] = df['Adult Mortality'].fillna(
    df['Adult Mortality'].mean()
)
def impute_Alcohol(cols):
    """Return the Alcohol value for one row, filling a missing one from Schooling.

    Args:
        cols: Two positional values, Alcohol first and Schooling second
            (a DataFrame row when used with ``DataFrame.apply(axis=1)``).

    Returns:
        The original Alcohol value when it is present; otherwise the fixed
        fill value for the Schooling bin.  ``None`` when Schooling is NaN
        too, because NaN fails every comparison below.
    """
    # Read by position: ``cols[0]`` on a string-labelled row is an integer
    # label lookup, which recent pandas versions deprecate.
    values = list(cols)
    al, sc = values[0], values[1]
    if not pd.isnull(al):
        return al
    # (inclusive upper bound of the Schooling bin, Alcohol fill value)
    bins = ((2.5, 4.0), (5.0, 1.5), (7.5, 2.5), (10.0, 3.0), (15, 4.0))
    for upper, fill in bins:
        if sc <= upper:
            return fill
    if sc > 15:
        return 10.0
    return None
# Fill Alcohol row by row from Schooling first; whatever is still missing
# afterwards is replaced by the column mean further down.
df['Alcohol'] = df[['Alcohol', 'Schooling']].apply(
    impute_Alcohol, axis='columns'
)
df['Diphtheria '] = df['Diphtheria '].fillna(df['Diphtheria '].mean())
df['Alcohol'] = df['Alcohol'].fillna(df['Alcohol'].mean())
df['Polio'] = df['Polio'].fillna(df['Polio'].mean())
def impute_HepatatisB(cols):
    """Return the Hepatitis B value for one row, filling a missing one from Diphtheria.

    Args:
        cols: Two positional values, Hepatitis B first and Diphtheria second
            (a DataFrame row when used with ``DataFrame.apply(axis=1)``).

    Returns:
        The original Hepatitis B value when it is present; otherwise the
        fixed fill value for the Diphtheria bin.  ``None`` when Diphtheria
        is NaN too, because NaN fails every comparison below.
    """
    # Read by position: ``cols[0]`` on a string-labelled row is an integer
    # label lookup, which recent pandas versions deprecate.
    values = list(cols)
    hep, dip = values[0], values[1]
    if not pd.isnull(hep):
        return hep
    # (inclusive upper bound of the Diphtheria bin, Hepatitis B fill value)
    bins = ((15, 75.0), (30, 20.0), (45, 38.0), (60, 43.0), (80, 63.0))
    for upper, fill in bins:
        if dip <= upper:
            return fill
    if dip > 80:
        return 88.4
    return None
# Fill missing Hepatitis B values row by row from Diphtheria, then plot the
# two columns against each other.
df['Hepatitis B'] = df[['Hepatitis B', 'Diphtheria ']].apply(
    impute_HepatatisB, axis='columns'
)
sns.scatterplot(x=df['Diphtheria '], y=df['Hepatitis B'])
def impute_BMI(c):
    """Return the BMI value for one row, filling a missing one from life expectancy.

    Args:
        c: Two positional values, BMI first and life expectancy second
            (a DataFrame row when used with ``DataFrame.apply(axis=1)``).

    Returns:
        The original BMI value when it is present; otherwise the fixed fill
        value for the life-expectancy bin.  ``None`` when life expectancy is
        NaN or above 100, since no bin covers those cases.
    """
    # Read by position: ``c[0]`` on a string-labelled row is an integer
    # label lookup, which recent pandas versions deprecate.
    values = list(c)
    b, l = values[0], values[1]
    if not pd.isnull(b):
        return b
    # (inclusive upper bound of the life-expectancy bin, BMI fill value)
    bins = ((50, 25.0), (60, 25.0), (70, 32.0), (80, 46.8), (100, 60.0))
    for upper, fill in bins:
        if l <= upper:
            return fill
    return None
# Fill missing BMI values row by row from life expectancy.
df[' BMI '] = df[[' BMI ', 'Life expectancy ']].apply(
    impute_BMI, axis='columns'
)
def impute_Total_exp(c):
    """Return Total expenditure for one row, filling a missing one from Alcohol.

    Args:
        c: Two positional values, Total expenditure first and Alcohol second
            (a DataFrame row when used with ``DataFrame.apply(axis=1)``).

    Returns:
        The original Total expenditure value when it is present; otherwise
        the fixed fill value for the Alcohol bin.  ``None`` when Alcohol is
        NaN too, because NaN fails every comparison below.
    """
    # Read by position: ``c[0]`` on a string-labelled row is an integer
    # label lookup, which recent pandas versions deprecate.
    values = list(c)
    t, a = values[0], values[1]
    if not pd.isnull(t):
        return t
    # (inclusive upper bound of the Alcohol bin, Total expenditure fill value)
    bins = ((2.5, 5.08), (5.0, 6.0), (10.0, 6.71), (12.5, 6.9))
    for upper, fill in bins:
        if a <= upper:
            return fill
    if a > 12.5:
        return 6.68
    return None
# Fill missing Total expenditure values row by row from Alcohol.
df['Total expenditure'] = df[['Total expenditure', 'Alcohol']].apply(
    impute_Total_exp, axis='columns'
)
def impute_GDP(c):
    """Return GDP for one row, filling a missing one from percentage expenditure.

    Args:
        c: Two positional values, GDP first and percentage expenditure second
            (a DataFrame row when used with ``DataFrame.apply(axis=1)``).

    Returns:
        The original GDP value when it is present; otherwise the fixed fill
        value for the percentage-expenditure bin.  ``None`` when percentage
        expenditure is NaN too, because NaN fails every comparison below.
    """
    # Read by position: ``c[0]`` on a string-labelled row is an integer
    # label lookup, which recent pandas versions deprecate.
    values = list(c)
    g, p = values[0], values[1]
    if not pd.isnull(g):
        return g
    # (inclusive upper bound of the percentage-expenditure bin, GDP fill value)
    bins = (
        (1250, 1100.0),
        (2500, 1800.0),
        (3750, 2900.0),
        (7500, 3500.0),
        (8750, 4500.0),
        (10000, 5000.0),
        (11250, 5700.0),
        (12500, 7000.0),
        (15000, 8000.0),
        (17500, 9000.0),
    )
    for upper, fill in bins:
        if p <= upper:
            return fill
    if p > 17500:
        return 8500.0
    return None
# Fill missing GDP values row by row from percentage expenditure.
df['GDP'] = df[['GDP', 'percentage expenditure']].apply(
    impute_GDP, axis='columns'
)
def impute_population(c):
    """Return Population for one row, filling a missing one from infant deaths.

    Args:
        c: Two positional values, Population first and infant deaths second
            (a DataFrame row when used with ``DataFrame.apply(axis=1)``).

    Returns:
        The original Population value when it is present; otherwise the
        fixed fill value for the infant-deaths bin.  ``None`` when infant
        deaths is NaN too, because NaN fails every comparison below.
    """
    # Read by position: ``c[0]`` on a string-labelled row is an integer
    # label lookup, which recent pandas versions deprecate.
    values = list(c)
    p, i = values[0], values[1]
    if not pd.isnull(p):
        return p
    # (inclusive upper bound of the infant-deaths bin, Population fill value)
    bins = (
        (100, 0.19 * 10**9),
        (250, 0.18 * 10**9),
        (350, 0.02 * 10**9),
        (900, 0.1 * 10**9),
        (1100, 0.18 * 10**9),
        (1250, 0.05 * 10**9),
        (1500, 0.19 * 10**9),
        (1750, 0.05 * 10**9),
    )
    for upper, fill in bins:
        if i <= upper:
            return fill
    if i > 1750:
        return 0.1 * 10**9
    return None
# Fill missing Population values row by row from infant deaths.
df['Population'] = df[['Population', 'infant deaths']].apply(
    impute_population, axis='columns'
)
def impute_Thin_1(c):
    """Return the thinness value for one row, filling a missing one from BMI.

    Args:
        c: Two positional values, thinness first and BMI second
            (a DataFrame row when used with ``DataFrame.apply(axis=1)``).

    Returns:
        The original thinness value when it is present; otherwise the fixed
        fill value for the BMI bin.  ``None`` when BMI is NaN too, because
        NaN fails every comparison below.
    """
    # Read by position: ``c[0]`` on a string-labelled row is an integer
    # label lookup, which recent pandas versions deprecate.
    values = list(c)
    t, b = values[0], values[1]
    if not pd.isnull(t):
        return t
    # (inclusive upper bound of the BMI bin, thinness fill value)
    bins = ((10, 5.0), (20, 10.0), (30, 8.0), (40, 6.0), (50, 3.0), (70, 4.0))
    for upper, fill in bins:
        if b <= upper:
            return fill
    if b > 70:
        return 1.0
    return None
# Fill missing thinness (1-19 years) values row by row from BMI.
df[' thinness 1-19 years'] = df[[' thinness 1-19 years', ' BMI ']].apply(
    impute_Thin_1, axis='columns'
)
def impute_Thin_1(c):
    """Return the thinness value for one row, filling a missing one from BMI.

    This re-declares ``impute_Thin_1`` with the same BMI bins and fill values
    as the definition earlier in the file; it is applied to the
    5-9 years thinness column right after.

    Args:
        c: Two positional values, thinness first and BMI second
            (a DataFrame row when used with ``DataFrame.apply(axis=1)``).

    Returns:
        The original thinness value when it is present; otherwise the fixed
        fill value for the BMI bin.  ``None`` when BMI is NaN too, because
        NaN fails every comparison below.
    """
    # Read by position: ``c[0]`` on a string-labelled row is an integer
    # label lookup, which recent pandas versions deprecate.
    values = list(c)
    t, b = values[0], values[1]
    if not pd.isnull(t):
        return t
    # (inclusive upper bound of the BMI bin, thinness fill value)
    bins = ((10, 5.0), (20, 10.0), (30, 8.0), (40, 6.0), (50, 3.0), (70, 4.0))
    for upper, fill in bins:
        if b <= upper:
            return fill
    if b > 70:
        return 1.0
    return None
# Fill missing thinness (5-9 years) values row by row from BMI.
df[' thinness 5-9 years'] = df[[' thinness 5-9 years', ' BMI ']].apply(
    impute_Thin_1, axis='columns'
)
def impute_Income(c):
    """Return income composition for one row, filling a missing one from life expectancy.

    Args:
        c: Two positional values, income composition of resources first and
            life expectancy second (a DataFrame row when used with
            ``DataFrame.apply(axis=1)``).

    Returns:
        The original income value when it is present; otherwise the fixed
        fill value for the life-expectancy bin.  ``None`` when life
        expectancy is NaN too, because NaN fails every comparison below.
    """
    # Read by position: ``c[0]`` on a string-labelled row is an integer
    # label lookup, which recent pandas versions deprecate.
    values = list(c)
    i, l = values[0], values[1]
    if not pd.isnull(i):
        return i
    # (inclusive upper bound of the life-expectancy bin, income fill value)
    bins = ((40, 0.4), (50, 0.42), (60, 0.402), (70, 0.54), (80, 0.71))
    for upper, fill in bins:
        if l <= upper:
            return fill
    if l > 80:
        return 0.88
    return None
# Fill missing income-composition values row by row from life expectancy.
df['Income composition of resources'] = df[
    ['Income composition of resources', 'Life expectancy ']
].apply(impute_Income, axis='columns')
def impute_schooling(c):
    """Return Schooling for one row, filling a missing one from life expectancy.

    Args:
        c: Two positional values, Schooling first and life expectancy second
            (a DataFrame row when used with ``DataFrame.apply(axis=1)``).

    Returns:
        The original Schooling value when it is present; otherwise the fixed
        fill value for the life-expectancy bin.  ``None`` when life
        expectancy is NaN too, because NaN fails every comparison below.
    """
    # Read by position: ``c[0]`` on a string-labelled row is an integer
    # label lookup, which recent pandas versions deprecate.
    values = list(c)
    s, l = values[0], values[1]
    if not pd.isnull(s):
        return s
    # (inclusive upper bound of the life-expectancy bin, Schooling fill value)
    # The 44-50 bin includes 50 so that a life expectancy of exactly 50 is
    # imputed instead of falling between two bins.
    bins = (
        (40, 8.0),
        (44, 7.5),
        (50, 8.1),
        (60, 8.2),
        (70, 10.5),
        (80, 13.4),
    )
    for upper, fill in bins:
        if l <= upper:
            return fill
    if l > 80:
        return 16.5
    return None
# Fill missing Schooling values row by row from life expectancy.
df['Schooling'] = df[['Schooling', 'Life expectancy ']].apply(
    impute_schooling, axis='columns'
)
# Null-mask heatmap again, now that the imputation steps have run.
sns.heatmap(pd.isnull(df))

# Keep a handle on the imputed frame; the sections below restart from it.
d1 = df

# Correlation heatmap. numeric_only=True restricts the matrix to numeric
# columns: the frame also holds string columns (e.g. Country, Status), on
# which DataFrame.corr() raises in pandas >= 2.0.
plt.figure(figsize=(15, 12))
sns.heatmap(df.corr(numeric_only=True), center=0, annot=True)

# Life expectancy against adult mortality and against infant deaths.
sns.scatterplot(x=df['Life expectancy '], y=df['Adult Mortality'])
sns.scatterplot(x=df['Life expectancy '], y=df['infant deaths'])
# Remarks:
# 2014 rows with life expectancy <= 65, ranked by Total expenditure.
df = d1
df = df[df['Year'] == 2014]
df = df[df['Life expectancy '] <= 65]
# Take 15 rows so the chart shows what its "Top 15" title announces.
df1 = df.sort_values('Total expenditure', ascending=False).head(15)

fig = plt.figure(figsize=(20, 8))
ax1 = plt.subplot(1, 1, 1)
plt.title("Top 15 Countries with the Highest Total expenditure in 2014 and its Life Expectancy", fontsize=22, fontweight="bold")

# Bars: total expenditure per country (left y-axis).
ax1.bar(df1['Country'], df1['Total expenditure'])
ax1.yaxis.grid(linestyle='-')
ax1.set_ylabel("Total expenditure", fontsize=18, fontweight="bold")
ax1.set_xlabel("Country", fontsize=18, fontweight="bold")
plt.xticks(rotation=60, fontsize=15)

# Red line: life expectancy of the same countries on a secondary y-axis.
ax2 = ax1.twinx()
ax2.plot(df1['Country'], df1['Life expectancy '], color='red', marker='.', markersize=20)
# Label the secondary axis explicitly instead of relying on the current axes.
ax2.set_ylabel("Life expectancy", fontsize=18, fontweight="bold")
# 2014 rows with life expectancy <= 65, ranked by percentage expenditure.
df = d1
df = df[df['Year'] == 2014]
df = df[df['Life expectancy '] <= 65]
# Take 15 rows so the chart shows what its "Top 15" title announces.
df1 = df.sort_values('percentage expenditure', ascending=False).head(15)

fig = plt.figure(figsize=(20, 8))
ax1 = plt.subplot(1, 1, 1)
plt.title("Top 15 Countries with the Highest Percentage expenditure in 2014 and its Life Expectancy", fontsize=22, fontweight="bold")

# Bars: percentage expenditure per country (left y-axis).
ax1.bar(df1['Country'], df1['percentage expenditure'])
ax1.yaxis.grid(linestyle='-')
ax1.set_ylabel("Percentage expenditure", fontsize=18, fontweight="bold")
ax1.set_xlabel("Country", fontsize=18, fontweight="bold")
plt.xticks(rotation=60, fontsize=15)

# Red line: life expectancy of the same countries on a secondary y-axis.
ax2 = ax1.twinx()
ax2.plot(df1['Country'], df1['Life expectancy '], color='red', marker='.', markersize=20)
# Label the secondary axis explicitly instead of relying on the current axes.
ax2.set_ylabel("Life expectancy", fontsize=18, fontweight="bold")
# The 15 most populous 2015 rows, with their life expectancy overlaid.
df = d1
df = df[df['Year'] == 2015]
df1 = df.sort_values('Population', ascending=False).head(15)

fig, ax1 = plt.subplots(figsize=(20, 8))
ax1.set_title(
    "Top 15 Countries with the Highest Population in 2015 and its Life Expectancy",
    fontsize=22,
    fontweight="bold",
)

# Bars: population per country (left y-axis).
ax1.bar(df1['Country'], df1['Population'])
ax1.set_ylabel("Population", fontsize=18, fontweight="bold")
ax1.set_xlabel("Country", fontsize=18, fontweight="bold")
plt.xticks(rotation=45, fontsize=15)

# Red line: life expectancy on a secondary y-axis.
ax2 = ax1.twinx()
ax2.plot(
    df1['Country'],
    df1['Life expectancy '],
    color='red',
    marker='.',
    markersize=20,
)
ax2.set_ylabel("Life expectancy", fontsize=18, fontweight="bold")
# The 15 least populous 2015 rows, with their life expectancy overlaid.
df = d1
df = df[df['Year'] == 2015]
df1 = df.sort_values('Population').head(15)

fig, ax1 = plt.subplots(figsize=(20, 8))
ax1.set_title(
    "Top 15 Countries with the Lowest Population in 2015 and its Life Expectancy",
    fontsize=22,
    fontweight="bold",
)

# Bars: population per country (left y-axis).
ax1.bar(df1['Country'], df1['Population'])
ax1.set_ylabel("Population", fontsize=18, fontweight="bold")
ax1.set_xlabel("Country", fontsize=18, fontweight="bold")
plt.xticks(rotation=45, fontsize=15)

# Red line: life expectancy on a secondary y-axis.
ax2 = ax1.twinx()
ax2.plot(
    df1['Country'],
    df1['Life expectancy '],
    color='red',
    marker='.',
    markersize=20,
)
ax2.set_ylabel("Life expectancy", fontsize=18, fontweight="bold")
# Scatter of income composition against life expectancy, coloured by Status,
# after dropping the rows whose income composition is exactly 0.
Data = d1
Data = Data.drop(
    Data[Data['Income composition of resources'] == 0].index
)
sns.scatterplot(
    x=Data['Income composition of resources'],
    y=Data['Life expectancy '],
    hue=Data.Status,
)
plt.title(
    "Correlation between two variables Income composition of resources and Life expectancy",
    fontsize=13,
    fontweight="bold",
)
plt.ylabel("Life Expectancy", fontsize=13, fontweight="bold")
plt.xlabel("Income_composition_of_resources", fontsize=13, fontweight="bold")
plt.show()
import plotly.graph_objects as go
# Yearly mean of Measles, one line per Status value.
df = d1
# numeric_only=True: the frame also holds a string Country column, on which
# GroupBy.mean() raises in pandas >= 2.0.
life_year = (
    df.groupby(by=['Year', 'Status'])
    .mean(numeric_only=True)
    .reset_index()
)
Developed = life_year.loc[life_year['Status'] == 'Developed', :]
Developing = life_year.loc[life_year['Status'] == 'Developing', :]

fig1 = go.Figure()
fig1.add_trace(go.Scatter(
    x=Developing['Year'],
    y=Developing['Measles '],
    mode='lines',
    name='Developing',
    marker_color='#f075c2',
))
fig1.add_trace(go.Scatter(
    x=Developed['Year'],
    y=Developed['Measles '],
    mode='lines',
    name='Developed',
    marker_color='#28d2c2',
))
fig1.update_layout(
    height=500,
    xaxis_title="Years",
    yaxis_title='Measles',
    title_text='Average Measles of Developing and Developed countries over the years',
)
fig1.show()
# Yearly mean of life expectancy, one line per Status value.
df = d1
# numeric_only=True: the frame also holds a string Country column, on which
# GroupBy.mean() raises in pandas >= 2.0.
life_year = (
    df.groupby(by=['Year', 'Status'])
    .mean(numeric_only=True)
    .reset_index()
)
Developed = life_year.loc[life_year['Status'] == 'Developed', :]
Developing = life_year.loc[life_year['Status'] == 'Developing', :]

fig1 = go.Figure()
fig1.add_trace(go.Scatter(
    x=Developing['Year'],
    y=Developing['Life expectancy '],
    mode='lines',
    name='Developing',
    marker_color='#f075c2',
))
fig1.add_trace(go.Scatter(
    x=Developed['Year'],
    y=Developed['Life expectancy '],
    mode='lines',
    name='Developed',
    marker_color='#28d2c2',
))
fig1.update_layout(
    height=500,
    xaxis_title="Years",
    yaxis_title='Life expectancy in age',
    title_text='Average Life expectancy of Developing and Developed countries over the years',
)
fig1.show()